The dataset described is the “Official Index of Localities” (Répertoire officiel des localités) provided by the Swiss Federal Office of Topography (swisstopo). It contains comprehensive information about all localities in Switzerland and the Principality of Liechtenstein, including their names, postal codes, and perimeters.
This dataset is regularly updated on a monthly basis, incorporating changes reported by cantonal authorities and Swiss Post. It serves various purposes, including spatial analysis, integration with other geographic datasets, usage as a geolocated background in GIS (Geographic Information Systems) and CAD (Computer-Aided Design) systems, statistical analysis, and as a reference dataset for information systems.
Updates and release notes for the dataset are provided periodically, detailing changes and improvements made over time. The Swiss Federal Office of Topography manages and distributes this dataset as part of its responsibilities in collecting and providing official geospatial data for Switzerland.
2.1.3.1 Creating Variable zip_code and merging with AMTOVZ_CSV_LV95
Code
df <- properties_filtered#the address column is like : '1844 Villeneuve VD' and has zip code number in it#taking out the zip code number and creating a new column 'zip_code'#the way to identify the zip code is to identify numbers that are 4 digits longdf$zip_code <-as.numeric(gsub("\\D", "", df$address))#removing the first two number of zip code has more than 4 numberdf$zip_code <-ifelse(df$zip_code >9999, df$zip_code %%10000, df$zip_code)
2.1.3.2 Using AMTOVZ_CSV_LV95 to get the city and canton from the zip code
Code
#read .csv AMTOVZ_CSV_LV95amto <-read.csv(file.path(here(),"data/AMTOVZ_CSV_WGS84.csv"), sep =";")#creating a new dataframe with 'Ortschaftsname' as 'City'Place_name', 'PLZ' as 'zip_code', 'KantonskÃ.rzel' as 'Canton_code', 'E' as 'lon' and 'N' as 'lat'amto_df <- amto[, c('Gemeindename', 'PLZ', 'Kantonskürzel', 'E', 'N')]#renaming the columnscolnames(amto_df) <-c('Community', 'zip_code', 'Canton_code', 'lon', 'lat')#remove duplicates of zip codeamto_df <- amto_df[!duplicated(amto_df$zip_code),]#add the variable of amto_df to the df if the zip code matchesdf <-merge(df, amto_df, by ="zip_code", all.x =TRUE)#check if there are nan in citydf[is.na(df$Community),]#> zip_code price number_of_rooms square_meters#> 1 25 2200000 10.0 263#> 2 25 2200000 6.5 165#> 3 26 655000 3.5 66#> 4 26 1995000 7.5 180#> 5 322 870000 2.5 59#> 6 322 880000 2.5 55#> 7 322 975000 3.5 56#> 230 1014 1510000 5.5 146#> 1137 1200 679000 5.5 142#> 1138 1200 16092000 7.0 400#> 1139 1200 3285450 5.0 230#> 5479 1919 1065000 4.5 130#> 5480 1919 785000 3.5 103#> 5481 1919 2558620 5.5 270#> 5482 1919 1908000 6.5 210#> 7622 2500 1100000 5.0 154#> 7623 2500 872500 4.5 144#> 7624 2500 420000 4.5 115#> 7625 2500 1450000 5.5 198#> 7626 2500 885500 5.5 130#> 7627 2500 872500 4.5 138#> 7628 2500 892500 4.5 144#> 7629 2500 885500 5.5 130#> 7630 2500 877500 4.5 138#> 7631 2500 870500 4.5 125#> 7632 2500 887500 5.5 130#> 7633 2500 887500 4.5 144#> 7634 2500 1050000 4.5 121#> 8326 3000 820000 5.5 165#> 8327 3000 1140000 3.5 115#> 8328 3000 1090000 3.5 115#> 8329 3000 920000 4.5 157#> 8330 3000 1090000 5.5 193#> 8331 3000 1090000 5.5 193#> 8332 3000 920000 4.5 157#> 8333 3000 720000 3.5 102#> 8334 3000 1590000 5.5 330#> 10435 4000 180000 3.0 70#> 10436 4000 975000 4.5 125#> 10437 4000 2100000 6.5 360#> 12360 5201 725000 3.5 95#> 13213 6000 695000 4.5 133#> 13966 6511 440000 2.0 64#> 14242 6547 15000000 7.5 220#> 14560 6602 2800000 6.5 250#> 14561 6602 2800000 7.5 242#> 14562 6602 270000 1.5 28#> 14563 6602 450000 3.5 75#> 14564 6604 1990000 4.5 220#> 14565 6604 2668590 5.5 290#> 14566 6604 760000 3.5 78#> 16579 6901 3660930 4.5 290#> 16580 6901 3660930 4.5 290#> 16581 6903 790000 3.5 105#> 16582 6907 995000 4.5 114#> 16583 6907 995000 4.5 114#> 16584 6911 469350 5.5 140#> 16585 6911 737550 4.5 82#> 16586 6911 660000 7.5 200#> 16587 6911 610000 3.5 103#> 17896 7133 2266290 5.5 160#> 17905 7135 2690000 8.5 236#> 18165 8000 2100000 4.5 152#> 18166 8000 1650000 4.5 142#> 18167 8000 925000 3.5 102#> 18168 8000 1650000 4.5 142#> 18169 8000 1150000 4.5 128#> 18170 8000 1450000 5.5 143#> 18171 8000 1990000 5.5 200#> 18172 8000 1990000 5.5 200#> 18173 8000 975000 4.5 122#> 18174 8000 2495000 5.5 482#> 18654 8238 245000 2.0 49#> 19078 8423 2110000 6.5 204#> 19079 8423 2190000 5.5 167#> 20292 9241 545000 4.5 100#> 20293 9241 730840 5.5 130#> address#> 1 1000 Lausanne 25#> 2 1000 Lausanne 25#> 3 1000 Lausanne 26#> 4 Lausanne 26, 1000 Lausanne 26#> 5 Via Cuolm Liung 30d, 7032 Laax GR 2#> 6 7032 Laax GR 2#> 7 Via Murschetg 29, 7032 Laax GR 2#> 230 1014 Lausanne#> 1137 Chemin des pralets, 74100 Etrembières, 1200 Genève#> 1138 1200 Genève#> 1139 1200 Genève#> 5479 1919 Martigny#> 5480 1919 Martigny#> 5481 1919 Martigny#> 5482 1919 Martigny#> 7622 2500 Biel/Bienne#> 7623 2500 Biel/Bienne#> 7624 2500 Biel/Bienne#> 7625 2500 Bienne#> 7626 2500 Biel/Bienne#> 7627 2500 Biel/Bienne#> 7628 2500 Biel/Bienne#> 7629 2500 Biel/Bienne#> 7630 2500 Biel/Bienne#> 7631 2500 Biel/Bienne#> 7632 2500 Biel/Bienne#> 7633 2500 Biel/Bienne#> 7634 Hohlenweg 11b, 2500 Biel/Bienne#> 8326 3000 Bern#> 8327 3000 Bern#> 8328 3000 Bern#> 8329 3000 Bern#> 8330 3000 Bern#> 8331 3000 Bern#> 8332 3000 Bern#> 8333 3000 Bern#> 8334 3000 Bern#> 10435 Lörrach Brombach Steinsack 6, 4000 Basel#> 10436 4000 Basel#> 10437 4000 Basel#> 12360 5201 Brugg AG#> 13213 in TRIENGEN, ca. 20 min. bei Luzern, 6000 Luzern#> 13966 6511 Cadenazzo#> 14242 Augio 1F, 6547 Augio#> 14560 6602 Muralto#> 14561 6602 Muralto#> 14562 6602 Muralto#> 14563 Via Bacilieri 2, 6602 Muralto#> 14564 6604 Solduno#> 14565 6604 Solduno#> 14566 6604 Locarno#> 16579 6901 Lugano#> 16580 6901 Lugano#> 16581 6903 Lugano#> 16582 6907 MASSAGNO#> 16583 6907 MASSAGNO#> 16584 6911 Campione d'Italia#> 16585 6911 Campione d'Italia#> 16586 6911 Campione d'Italia#> 16587 6911 Campione d'Italia#> 17896 Inder Platenga 34, 7133 Obersaxen#> 17905 7135 Fideris#> 18165 8000 Zürich#> 18166 8000 Zürich#> 18167 8000 Zürich#> 18168 8000 Zürich#> 18169 8000 Zürich#> 18170 8000 Zürich#> 18171 8000 Zürich#> 18172 8000 Zürich#> 18173 8000 Zürich#> 18174 8000 Zürich#> 18654 Stemmerstrasse 14, 8238 Büsingen am Hochrhein#> 19078 Chüngstrasse 60, 8423 Embrach#> 19079 Chüngstrasse 48, 8423 Embrach#> 20292 9241 Kradolf#> 20293 9241 Kradolf#> canton property_type floor year_category Community#> 1 Vaud Single house 1919-1945 <NA>#> 2 Vaud Villa 2006-2010 <NA>#> 3 Vaud Apartment noteg 2016-2024 <NA>#> 4 Vaud Villa 1961-1970 <NA>#> 5 Grisons Apartment eg 2016-2024 <NA>#> 6 Grisons Apartment noteg 2016-2024 <NA>#> 7 Grisons Apartment noteg 2011-2015 <NA>#> 230 Vaud Apartment eg 2011-2015 <NA>#> 1137 Geneva Bifamiliar house 2016-2024 <NA>#> 1138 Geneva Single house 2011-2015 <NA>#> 1139 Geneva Bifamiliar house 1981-1990 <NA>#> 5479 Valais Apartment noteg 2016-2024 <NA>#> 5480 Valais Apartment noteg 2016-2024 <NA>#> 5481 Valais Attic flat noteg 2016-2024 <NA>#> 5482 Valais Apartment noteg 2016-2024 <NA>#> 7622 Bern Single house 2001-2005 <NA>#> 7623 Bern Villa 2016-2024 <NA>#> 7624 Bern Apartment noteg 1971-1980 <NA>#> 7625 Bern Single house 2016-2024 <NA>#> 7626 Bern Villa 2016-2024 <NA>#> 7627 Bern Single house 2016-2024 <NA>#> 7628 Bern Single house 2016-2024 <NA>#> 7629 Bern Single house 2016-2024 <NA>#> 7630 Bern Single house 2016-2024 <NA>#> 7631 Bern Single house 2016-2024 <NA>#> 7632 Bern Single house 2016-2024 <NA>#> 7633 Bern Single house 2016-2024 <NA>#> 7634 Bern Single house 2001-2005 <NA>#> 8326 Bern Apartment noteg 2016-2024 <NA>#> 8327 Bern Apartment eg 2016-2024 <NA>#> 8328 Bern Apartment eg 2016-2024 <NA>#> 8329 Bern Apartment noteg 2016-2024 <NA>#> 8330 Bern Roof flat noteg 2016-2024 <NA>#> 8331 Bern Apartment noteg 2016-2024 <NA>#> 8332 Bern Duplex noteg 2016-2024 <NA>#> 8333 Bern Apartment eg 2016-2024 <NA>#> 8334 Bern Apartment noteg 1991-2000 <NA>#> 10435 Basel-Stadt Single house 1961-1970 <NA>#> 10436 Basel-Stadt Single house 2016-2024 <NA>#> 10437 Basel-Stadt Villa 2016-2024 <NA>#> 12360 Aargau Apartment noteg 2016-2024 <NA>#> 13213 Lucerne Apartment noteg 1991-2000 <NA>#> 13966 Ticino Apartment noteg 2016-2024 <NA>#> 14242 Grisons Single house 2016-2024 <NA>#> 14560 Ticino Single house 1981-1990 <NA>#> 14561 Ticino Single house 1981-1990 <NA>#> 14562 Ticino Apartment eg 1961-1970 <NA>#> 14563 Ticino Apartment noteg 1946-1960 <NA>#> 14564 Ticino Attic flat noteg 2011-2015 <NA>#> 14565 Ticino Apartment noteg 2011-2015 <NA>#> 14566 Ticino Apartment noteg 2011-2015 <NA>#> 16579 Ticino Attic flat noteg 2011-2015 <NA>#> 16580 Ticino Apartment noteg 2011-2015 <NA>#> 16581 Ticino Apartment noteg 2006-2010 <NA>#> 16582 Ticino Apartment noteg 2016-2024 <NA>#> 16583 Ticino Apartment noteg 2016-2024 <NA>#> 16584 Ticino Apartment noteg 1946-1960 <NA>#> 16585 Ticino Apartment noteg 1991-2000 <NA>#> 16586 Ticino Single house 1971-1980 <NA>#> 16587 Ticino Apartment eg 1946-1960 <NA>#> 17896 Grisons Single house 2006-2010 <NA>#> 17905 Grisons Single house 0-1919 <NA>#> 18165 Zurich Apartment noteg 2016-2024 <NA>#> 18166 Zurich Attic flat noteg 2016-2024 <NA>#> 18167 Zurich Apartment noteg 2016-2024 <NA>#> 18168 Zurich Apartment noteg 2016-2024 <NA>#> 18169 Zurich Apartment noteg 2016-2024 <NA>#> 18170 Zurich Apartment eg 2016-2024 <NA>#> 18171 Zurich Attic flat noteg 2006-2010 <NA>#> 18172 Zurich Apartment noteg 2006-2010 <NA>#> 18173 Zurich Single house 2016-2024 <NA>#> 18174 Zurich Apartment noteg 0-1919 <NA>#> 18654 Schaffhausen Apartment noteg 1961-1970 <NA>#> 19078 Zurich Bifamiliar house 2016-2024 <NA>#> 19079 Zurich Single house 2016-2024 <NA>#> 20292 Thurgau Apartment noteg 1991-2000 <NA>#> 20293 Thurgau Apartment noteg 1991-2000 <NA>#> Canton_code lon lat#> 1 <NA> NA NA#> 2 <NA> NA NA#> 3 <NA> NA NA#> 4 <NA> NA NA#> 5 <NA> NA NA#> 6 <NA> NA NA#> 7 <NA> NA NA#> 230 <NA> NA NA#> 1137 <NA> NA NA#> 1138 <NA> NA NA#> 1139 <NA> NA NA#> 5479 <NA> NA NA#> 5480 <NA> NA NA#> 5481 <NA> NA NA#> 5482 <NA> NA NA#> 7622 <NA> NA NA#> 7623 <NA> NA NA#> 7624 <NA> NA NA#> 7625 <NA> NA NA#> 7626 <NA> NA NA#> 7627 <NA> NA NA#> 7628 <NA> NA NA#> 7629 <NA> NA NA#> 7630 <NA> NA NA#> 7631 <NA> NA NA#> 7632 <NA> NA NA#> 7633 <NA> NA NA#> 7634 <NA> NA NA#> 8326 <NA> NA NA#> 8327 <NA> NA NA#> 8328 <NA> NA NA#> 8329 <NA> NA NA#> 8330 <NA> NA NA#> 8331 <NA> NA NA#> 8332 <NA> NA NA#> 8333 <NA> NA NA#> 8334 <NA> NA NA#> 10435 <NA> NA NA#> 10436 <NA> NA NA#> 10437 <NA> NA NA#> 12360 <NA> NA NA#> 13213 <NA> NA NA#> 13966 <NA> NA NA#> 14242 <NA> NA NA#> 14560 <NA> NA NA#> 14561 <NA> NA NA#> 14562 <NA> NA NA#> 14563 <NA> NA NA#> 14564 <NA> NA NA#> 14565 <NA> NA NA#> 14566 <NA> NA NA#> 16579 <NA> NA NA#> 16580 <NA> NA NA#> 16581 <NA> NA NA#> 16582 <NA> NA NA#> 16583 <NA> NA NA#> 16584 <NA> NA NA#> 16585 <NA> NA NA#> 16586 <NA> NA NA#> 16587 <NA> NA NA#> 17896 <NA> NA NA#> 17905 <NA> NA NA#> 18165 <NA> NA NA#> 18166 <NA> NA NA#> 18167 <NA> NA NA#> 18168 <NA> NA NA#> 18169 <NA> NA NA#> 18170 <NA> NA NA#> 18171 <NA> NA NA#> 18172 <NA> NA NA#> 18173 <NA> NA NA#> 18174 <NA> NA NA#> 18654 <NA> NA NA#> 19078 <NA> NA NA#> 19079 <NA> NA NA#> 20292 <NA> NA NA#> 20293 <NA> NA NA
We have 77 NAN, where
The zip code was not found in the atmo df
The zip code was incorectly isolated from the address
Removed them ::: {.cell layout-align=“center”}
Code
#remove the rows with nan in cityproperties_filtered <- df[!is.na(df$Community),]reactable(head(properties_filtered, 100))
# read csvimpots <-read.csv(file.path(here(),"data/estv_income_rates.csv"), sep =",", header =TRUE, stringsAsFactors =FALSE)# Remove 1st rowimpots <- impots[-1, ]# Remove 3rd columnimpots <- impots[, -3]# Combine text for columns 4-8impots[1, 4:8] <-"Impôt sur le revenu"# Combine text for columns 9-13impots[1, 9:13] <-"Impôt sur la fortune"# Combine text for columns 14-16impots[1, 14:16] <-"Impôt sur le bénéfice"# Combine text for columns 17-19impots[1, 17:19] <-"Impôt sur le capital"# Combine content of the first 2 rows into the 2nd rowimpots[2, ] <-apply(impots[1:2, ], 2, function(x) paste(ifelse(is.na(x[1]), x[2], ifelse(is.na(x[2]), x[1], paste(x[1], x[2], sep =" ")))))# Remove 1st rowimpots <- impots[-1, ]# Assign the text to the 1st row and 1st columnimpots[1, 1] <-"Coefficient d'impôt en %"# Replace column names with the content of the first rowcolnames(impots) <- impots[1, ]impots <- impots[-1, ]# Check for missing values in impotsany_missing <-any(is.na(impots))if (any_missing) {print("There are missing values in impots.")} else {print("There are no missing values in impots.")}#> [1] "There are no missing values in impots."# Replace row names with the content of the 3rd columnrow.names(impots) <- impots[, 3]impots <- impots[, -3]# Remove 2nd column (to avoid canton column)impots <- impots[, -2]# Remove impot egliseimpots <- impots[, -c(4:6)]impots <- impots[, -c(6:8)]impots <- impots[, -8]impots <- impots[, -10]# Clean data and convert to numericcleaned_impots <-apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))# Replace NA values with 0cleaned_impots[is.na(cleaned_impots)] <-0# Check for non-numeric valuesnon_numeric <-sum(!is.na(cleaned_impots) &!is.numeric(cleaned_impots))if (non_numeric >0) {print(paste("Warning: Found", non_numeric, "non-numeric values."))}rownames(cleaned_impots) <-rownames(impots)#reactable(head(cleaned_impots, 100))
2.1.5 Commune Data
2.1.5.1 Cleaning
ajouter source
ajouter description
expliquer blabla
Replaces NAs in both Taux de couverture social and Political (Conseil National Datas) For Taux de couverture Social: NAs were due to reason “Q” = “Not indicated to protect confidentiality” We replaced the NAs by the average taux de couverture in Switzerland in 2019, which was 3.2%
For Political data: NAs were due to reason “M” = “Not indicated because data was not important or applicable” Therefore, we replaced the NAs by 0
Code
# il faudra changer le pathcommune_prep <-read.csv(file.path(here(),"data/commune_data.csv"), sep =";", header =TRUE, stringsAsFactors =FALSE)# We keep only 2019 to have some reference? (2020 is apparently not really complete)commune_2019 <-subset(commune_prep, PERIOD_REF =="2019") %>%select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))# delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonnecommune_2019 <-subset(commune_2019, STATUS =="A") %>%select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))# on enlève les lignes qui sont des aggrégatscommune_2019 <-subset(commune_2019, REGION !="Schweiz")commune_2019 <- commune_2019 %>%pivot_wider(names_from = INDICATORS, values_from = VALUE)# Rename columns using the provided mapcommune <- commune_2019 %>%rename(`Population - Habitants`= Ind_01_01,`Population - Densité de la population`= Ind_01_03,`Population - Etrangers`= Ind_01_08,`Population - Part du groupe d'âge 0-19 ans`= Ind_01_04,`Population - Part du groupe d'âge 20-64 ans`= Ind_01_05,`Population - Part du groupe d'âge 65+ ans`= Ind_01_06,`Population - Taux brut de nuptialité`= Ind_01_09,`Population - Taux brut de divortialité`= Ind_01_10,`Population - Taux brut de natalité`= Ind_01_11,`Population - Taux brut de mortalité`= Ind_01_12,`Population - Ménages privés`= Ind_01_13,`Population - Taille moyenne des ménages`= Ind_01_14,`Sécurité sociale - Taux d'aide sociale`= Ind_11_01,`Conseil national - PLR`= Ind_14_01,`Conseil national - PDC`= Ind_14_02,`Conseil national - PS`= Ind_14_03,`Conseil national - UDC`= Ind_14_04,`Conseil national - PEV/PCS`= Ind_14_05,`Conseil national - PVL`= Ind_14_06,`Conseil national - PBD`= Ind_14_07,`Conseil national - PST/Sol.`= Ind_14_08,`Conseil national - PES`= Ind_14_09,`Conseil national - Petits partis de droite`= Ind_14_10)# If no one voted for a party, set as NA -> replacing it with 0 insteadcommune <- commune %>%mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))# Removing NAs from Taux de couverture sociale column# Setting the mean as the mean for Switzerland in 2019 (3.2%)mean_taux_aide_social <-3.2# Replace NA values with the meancommune <- commune %>%mutate(`Sécurité sociale - Taux d'aide sociale`=if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))#show 100 first rows of commune using reactablereactable(head(commune, 100))
Code
# commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)# # # We keep only 2019 to have some reference? (2020 is apparently not really complete)# commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))# # # delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne# commune_2019 <- subset(commune_2019, STATUS == "A") %>%# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))# # # on enlève les lignes qui sont des aggrégats# commune_2019 <- subset(commune_2019, REGION != "Schweiz")# # commune_2019 <- commune_2019 %>%# pivot_wider(names_from = INDICATORS, values_from = VALUE)# # # Rename columns using the provided map# commune <- commune_2019 %>%# rename(`Population - Habitants` = Ind_01_01,# `Population - Densité de la population` = Ind_01_03,# `Population - Etrangers` = Ind_01_08,# `Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,# `Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,# `Population - Part du groupe d'âge 65+ ans` = Ind_01_06,# `Population - Taux brut de nuptialité` = Ind_01_09,# `Population - Taux brut de divortialité` = Ind_01_10,# `Population - Taux brut de natalité` = Ind_01_11,# `Population - Taux brut de mortalité` = Ind_01_12,# `Population - Ménages privés` = Ind_01_13,# `Population - Taille moyenne des ménages` = Ind_01_14,# `Sécurité sociale - Taux d'aide sociale` = Ind_11_01,# `Conseil national - PLR` = Ind_14_01,# `Conseil national - PDC` = Ind_14_02,# `Conseil national - PS` = Ind_14_03,# `Conseil national - UDC` = Ind_14_04,# `Conseil national - PEV/PCS` = Ind_14_05,# `Conseil national - PVL` = Ind_14_06,# `Conseil national - PBD` = Ind_14_07,# `Conseil national - PST/Sol.` = Ind_14_08,# `Conseil national - PES` = Ind_14_09,# `Conseil national - Petits partis de droite` = Ind_14_10)# # # If no one voted for a party, set as NA -> replacing it with 0 instead# commune <- commune %>%# mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))# # # # Removing NAs from Taux de couverture sociale column# # Setting the mean as the mean for Switzerland in 2019 (3.2%)# mean_taux_aide_social <- 3.2# # # Replace NA values with the mean# commune <- commune %>%# mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))#
3 Unsupervised learning
Clustering and/or dimension reduction
Trying to Cluster commune datas to: 1. Reduce dimension 2. See similarities
A regarder, est-ce qu’on fait un cluster pour les datas politques + un cluster pour les data démographiques, ou est-ce qu’on regroupe tout?
Code
set.seed(123)# Clustering demographiccols_commune_demographic <-select(commune, -c("REGION", "CODE_REGION","Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))# Scale the columns, some are total numbers, some are percentagescols_commune_demographic <-scale(cols_commune_demographic)# Calculate the distance matrixdist_matrix_demographic <-dist(cols_commune_demographic, method ="minkowski")# Perform hierarchical clusteringhclust_model_demographic <-hclust(dist_matrix_demographic, method ="ward.D")# Create dendrogramdend_demo <-as.dendrogram(hclust_model_demographic)dend_demo <-color_branches(dend_demo, k =5) #Set number of cluster to 5, to keep the same scale for all our variablesplot(dend_demo, main ="Demographics - Hierarchical Clustering Dendrogram")
Code
# Clustering politicsset.seed(123)cols_commune_politics <-select(commune, c("Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))# Scale the columns, some are total numbers, some are percentagescols_commune_politics <-scale(cols_commune_politics)# Calculate the distance matrixdist_matrix_politics <-dist(cols_commune_politics, method ="minkowski")# Perform hierarchical clusteringhclust_model_politics <-hclust(dist_matrix_politics, method ="ward.D")# Create dendrogramdend_pol <-as.dendrogram(hclust_model_politics)dend_pol <-color_branches(dend_pol, k =5) #Set number of cluster to 5, to keep the same scale for all our variablesplot(dend_pol, main ="Politics - Hierarchical Clustering Dendrogram")
To prevent introducing 10 new types of taxes, we conducted a clustering analysis on the tax dataset to identify which municipalities can be grouped together. Based on the within-cluster sum of squares, we found 5 clusters. These 5 distinct clusters will be assigned to properties to determine which municipalities are subject to a particular type of tax. ## Tax ::: {.cell layout-align=“center”}
Code
set.seed(123)# Clean data and convert to numericcleaned_impots <-apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))cleaned_impots[is.na(cleaned_impots)] <-0# Replace NA values with 0# Scale the featuresscaled_impots <-scale(cleaned_impots)# Perform k-means clusteringk <-2# Initial guess for the number of clusterskmeans_model <-kmeans(scaled_impots, centers = k)# Check within-cluster sum of squares (elbow method)wss <-numeric(10)for (i in1:10) { kmeans_model <-kmeans(scaled_impots, centers = i) wss[i] <-sum(kmeans_model$withinss)}#plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")# Adjust k based on elbow methodk <-5# Perform k-means clustering again with optimal kkmeans_model <-kmeans(scaled_impots, centers = k)# Assign cluster labels to dendrogramclusters <- kmeans_model$cluster# Plot dendrogram#colored_dend <- color_branches(dend, k = 5)#y_zoom_range <- c(0, 80) # Adjust the y-axis range as needed#plot(colored_dend, main = "Hierarchical Clustering Dendrogram", horiz = FALSE, ylim = y_zoom_range)
# Preparing df_commune for merging with main datasetdf_commune <-select(commune, REGION)df_commune$Demographic_cluster <-cutree(hclust_model_demographic, k =5)df_commune$Political_cluster <-cutree(hclust_model_politics, k =5)# Preparing to mergemerging <-inner_join(amto_df, df_commune, by =c("Community"="REGION"))impots_cluster_subset <- impots_cluster[, c("Community", "cluster")]merging <- merging %>%left_join(impots_cluster_subset, by ="Community")clusters_df <- merging %>%rename(Tax_cluster = cluster) %>%rename(Commune = Community)clusters_df <- clusters_df %>%select(c("Commune", "zip_code", "Canton_code", "Demographic_cluster", "Political_cluster", "Tax_cluster"))# Only NAs are for commune Brugg, (written Brugg (AG) in the other data set) -> j'entre le cluster à la manoclusters_df$Tax_cluster[is.na(clusters_df$Tax_cluster)] <-2# adding it to our main data set:properties_filtered <-merge(properties_filtered, clusters_df[, c("zip_code", "Demographic_cluster", "Political_cluster", "Tax_cluster")], by ="zip_code", all.x =TRUE)
Code
# Dropping 228 rows containing NAs after the mergena_count <-sum(is.na(properties_filtered[, c("Demographic_cluster", "Political_cluster", "Tax_cluster")]))# Print the resultif (na_count >0) {print("There are NA values in the merged dataframe.")print(na_count)} else {print("There are no NA values in the merged dataframe.")}#> [1] "There are NA values in the merged dataframe."#> [1] 675# Find rows with NA values in the specified columnsna_rows <-subset(properties_filtered, is.na(Demographic_cluster) |is.na(Political_cluster) |is.na(Tax_cluster))# Drop the NA rowsproperties_filtered <-anti_join(properties_filtered, na_rows, by ="zip_code")
4 EDA
4.1 Map representation of distribution of properties
Code
# Create a leaflet map with optimized markersmap <-leaflet(properties_filtered) %>%addTiles() %>%# Add default OpenStreetMap tilesaddProviderTiles(providers$Esri.NatGeoWorldMap) %>%# Add topographic maps for contextaddCircleMarkers(~lon, ~lat,radius =1.5, # Smaller radius for the circle markerscolor ="#32012F", # Specifying a color for the markersfillOpacity =0.2, # Semi-transparent fillstroke =FALSE, # No border to the circle markers to reduce visual noisepopup =~paste("Price: ", price, "<br>","Rooms: ", number_of_rooms, "<br>","Type: ", property_type, "<br>","Year: ", year_category),label =~paste("Price: ", price) # Tooltip on hover ) %>%addLegend(position ="bottomright", # Position the legend at the bottom rightcolors ="#32012F", # Use the same color as the markerslabels ="Properties"# Label for the legend )map$width <-"100%"# Set the width of the map to 100%map$height <-600# Set the height of the map to 600 pixelsmap
4.2 Histogram of prices
Code
histogram_price <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="red") +labs(title ="Distribution of Prices",x ="Price",y ="Frequency") +theme_minimal()# Convert ggplot object to plotly objectinteractive_histogram_price <-ggplotly(histogram_price, width =600, height =400 )# Display the interactive histograminteractive_histogram_price
4.3 Histogram of prices for each property type
note : only price between 0 and 500000 so some outliers aren’t here
Code
# Create the ggplot objecthistogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ property_type, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Property Type",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly objectinteractive_histogram <-ggplotly(histogram, width =600, height =1000)# Display the interactive plotinteractive_histogram
4.4 Histogram of prices for each year category
note : only price between 0 and 500000 so some outliers aren’t here
Code
# Create a histogram of prices for each year categoryhistogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ year_category, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Year Category",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly objectinteractive_histogram_year <-ggplotly(histogram, width =600, height =1000)# Display the interactive plotinteractive_histogram_year
4.5 Histogram of prices for each canton
note : only price between 0 and 500000 so some outliers aren’t here
Code
histogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ canton, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Canton for properties between 0 and 5 million",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly object with adjusted heightinteractive_histogram <-ggplotly(histogram, width =600, height =1000) %>%layout(height =1000) # Adjust the height as needed# Display the interactive plotinteractive_histogram
4.6 Histogram of prices for each number of rooms
note : only price between 0 and 500000 so some outliers aren’t here
Code
# Create a histogram of prices for each number of roomshistogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ number_of_rooms, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Number of Rooms",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly object with adjusted heightinteractive_histogram <-ggplotly(histogram, width =600, height =1000)%>%layout(height =2000)# Display the interactive plotinteractive_histogram
4.7 Histogram of properties by square meters
Code
histogram <-ggplot(properties_filtered, aes(x = square_meters)) +geom_histogram(binwidth =15, fill ="skyblue", color ="black") +labs(title ="Distribution of Properties by Square Meters",x ="Square Meters",y ="Frequency") +theme_minimal() +xlim(0, 2000)# Convert ggplot object to plotly object with adjusted heightinteractive_histogram <-ggplotly(histogram, width =NULL, height =NULL) # Adjust width and height as needed# Display the interactive plotinteractive_histogram
4.8 Histogram of prices with impot
Code
# Create the boxplotboxplot <-ggplot(properties_filtered, aes(x =as.factor(Tax_cluster), y = price)) +geom_boxplot(fill ="skyblue", color ="black") +labs(title ="Boxplot of Property Prices by Tax Cluster",x ="Tax Cluster",y ="Price") +theme_minimal() +ylim(100000, 400000)# Convert ggplot object to plotly objectinteractive_boxplot <-ggplotly(boxplot)interactive_boxplot
Code
impot_cols <-names(properties_filtered)[startsWith(names(properties_filtered), "Impôt")]# Count the number of NA values in selected columnsna_counts <-colSums(is.na(properties_filtered[impot_cols]))# Print the countsprint(na_counts)#> numeric(0)